# ============================================================================
# Script: Justifying European Border Policies — Descriptives
# Author: Lisa Herbig
# Project: Justifying European Border Policies (JEPP)
# Date: 27 November 2025
# ============================================================================


# ------------------------------------------------------------------------------
# Libraries
# ------------------------------------------------------------------------------
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggnewscale)
library(lubridate)

# ------------------------------------------------------------------------------
# Output folders + helpers
# ------------------------------------------------------------------------------
# NOTE(review): this absolute path is specific to one machine. Every relative
# path used further down ("figures/", "tables/") resolves against it.
setwd("~/OneDrive - UvA/1. Justification Paper/JEPP_Submission/JustifyingEuropeanBorderPolicies_SupplementaryMaterials")

# Create both output folders; no warning is raised when they already exist.
invisible(lapply(
  c("figures", "tables"),
  dir.create,
  showWarnings = FALSE,
  recursive = TRUE
))

# Write a plot into the "figures" folder via ggsave().
#
# Args:
#   plot_obj: plot object handed to ggsave() as `plot`.
#   filename: file name (including extension) inside "figures/".
#   width, height: plot size handed to ggsave() (defaults 9 and 6).
#   dpi: resolution handed to ggsave() (default 300).
# Returns:
#   Whatever ggsave() returns.
save_jpg <- function(plot_obj, filename, width = 9, height = 6, dpi = 300) {
  target <- file.path("figures", filename)
  ggsave(
    filename = target,
    plot = plot_obj,
    width = width,
    height = height,
    dpi = dpi
  )
}
# Write a data frame as CSV into the "tables" folder, without row names.
#
# Args:
#   df: data frame (or tibble) to export.
#   filename: file name (including extension) inside "tables/".
save_tbl <- function(df, filename) {
  target <- file.path("tables", filename)
  utils::write.csv(df, file = target, row.names = FALSE)
}

# ------------------------------------------------------------------------------
# Load data
# ------------------------------------------------------------------------------
# Read the coded documents. The path is relative to the project directory
# selected with setwd() earlier in this script, so the machine-specific
# project location is spelled out in one place only.
data_file <- file.path("data", "Justification_CODING_FINAL.csv")
if (!file.exists(data_file)) {
  # Fail early with the resolved location instead of a generic reader error.
  stop(
    "Input file not found: ", normalizePath(data_file, mustWork = FALSE),
    call. = FALSE
  )
}
dataset <- read_csv(data_file)

# ------------------------------------------------------------------------------
# Quick diagnostics
# ------------------------------------------------------------------------------
# Every row of `dataset` is counted as one document.
total_documents <- nrow(dataset)

# A non-missing POLICY1..POLICY6 cell counts as one recorded policy.
policy_columns <- paste0("POLICY", 1:6)
total_policies <- sum(!is.na(dataset[, policy_columns]))

# Justification name columns, ordered J1 (P1..P6), then J2, then J3.
ANAME_cols <- paste0(
  "P", rep(1:6, times = 3),
  "J", rep(1:3, each = 6),
  "_ANAME"
)

# Documents in which the first justification slot (J1) is empty for all six
# policies. The first six entries of ANAME_cols are exactly the J1 columns.
docs_without_justification <- sum(
  rowSums(!is.na(dataset[, ANAME_cols[1:6]])) == 0
)

percent_without_justification <- (docs_without_justification / total_documents) * 100

# Number of non-missing cells across all justification name columns.
total_justifications_recorded <- sum(!is.na(dataset[, ANAME_cols]))

# How many justifications (0, 1, 2 or 3) does each recorded policy receive?
# A policy is "recorded" where its POLICY1..POLICY6 cell is not NA.
policy_mat <- dataset[, policy_columns]
has_policy <- !is.na(policy_mat)  # logical matrix, same shape as policy_mat

# One column per policy slot: number of non-missing PkJ1..PkJ3 _ANAME cells
# in each document (rows = documents, columns = policies 1..6).
num_just_mat <- do.call(
  cbind,
  lapply(1:6, function(p) {
    rowSums(!is.na(dataset[, paste0("P", p, "J", 1:3, "_ANAME")]))
  })
)

# Keep only the cells that belong to a recorded policy.
num_just_vec <- num_just_mat[has_policy]

# Tally policies by justification count; positions 1..4 hold counts 0..3.
just_level_counts <- vapply(
  0:3,
  function(k) sum(num_just_vec == k),
  integer(1)
)
policies_with_0 <- just_level_counts[1]
policies_with_1 <- just_level_counts[2]
policies_with_2 <- just_level_counts[3]
policies_with_3 <- just_level_counts[4]

# Shares relative to all recorded policies, rounded to two decimals.
just_level_percent <- round(100 * just_level_counts / total_policies, 2)
percent_with_0 <- just_level_percent[1]
percent_with_1 <- just_level_percent[2]
percent_with_2 <- just_level_percent[3]
percent_with_3 <- just_level_percent[4]


# One-row summary table of the diagnostics. Unnamed arguments are
# auto-named by tibble() after the variable they come from.
diag_tbl <- tibble(
  total_documents,
  docs_without_justification,
  percent_without_justification = round(percent_without_justification, 2),
  total_policies,
  total_justifications_recorded,
  percent_with_0,
  policies_with_0,
  percent_with_1,
  policies_with_1,
  percent_with_2,
  policies_with_2,
  percent_with_3,
  policies_with_3
)
save_tbl(diag_tbl, "00_quick_diagnostics.csv")

# ------------------------------------------------------------------------------
# Figure 1: Distribution of justification types (percent of all recorded)
# ------------------------------------------------------------------------------
# Justification codes that are aggregated for Figure 1. Labels as used in the
# Figure 1 legends: SEC = Security, ECO = Economy, PH = Public Health,
# HUM = Humanitarian, LEG = Legal, EFE = Effectiveness, SOL = Solidarity,
# EU = EU Reference.
justification_types <- c("SEC","ECO","PH","HUM","LEG","EFE","SOL","EU")

# Add a per-row total for one justification type to `df`.
#
# Columns are picked by name: every column ending in "_J_<jt>" is included.
# Two codes have an additional suffix: "SOL" also collects columns ending in
# "_SOL", and "OA" also collects columns ending in "_S_O_B".
#
# Args:
#   df: data frame (or tibble) holding the indicator columns.
#   jt: a single justification code, e.g. "SEC".
# Returns:
#   `df` with a numeric column named `jt` containing the row sums of the
#   matching columns (NA cells are ignored). If no column matches, the new
#   column is all zeros.
sum_justification_columns <- function(df, jt) {
  # switch() yields NULL for codes without an extra suffix, and paste0/c()
  # simply drop that NULL.
  extra_pattern <- switch(jt, SOL = "_SOL$", OA = "_S_O_B$")
  patterns <- c(paste0("_J_", jt, "$"), extra_pattern)

  # A column may match both patterns (e.g. "..._J_SOL"); count it once.
  cols <- unique(unlist(
    lapply(patterns, grep, x = names(df), value = TRUE)
  ))

  df[[jt]] <- if (length(cols) > 0) {
    rowSums(df[, cols, drop = FALSE], na.rm = TRUE)
  } else {
    # Length-matched zeros so that a zero-row `df` is handled as well; a
    # scalar 0 cannot be assigned to a data frame without rows.
    rep(0, nrow(df))
  }
  df
}
# Append one per-row total column per justification code to `dataset`.
dataset <- Reduce(sum_justification_columns, justification_types, dataset)

# Column totals over all documents, named by justification code.
total_justifications <- colSums(dataset[, justification_types], na.rm = TRUE)

justification_sums_df <- data.frame(
  Justification = justification_types,
  Total = unname(total_justifications)
)

# Share relative to the number of recorded justifications (non-missing _ANAME
# cells), not relative to the sum of the type totals.
# NOTE(review): if one justification can carry several codes, these shares
# need not add up to 100 - confirm this is the intended denominator.
justification_sums_df$Percent <-
  (justification_sums_df$Total / total_justifications_recorded) * 100

save_tbl(justification_sums_df, "01_justification_totals_and_percent.csv")

# Fix the bar order on the x axis and assign each code to its legend group:
# "EU" stands alone, SEC/PH/HUM/ECO are aims, everything else is a process
# justification.
justification_sums_df <- justification_sums_df %>%
  mutate(
    Justification = factor(
      Justification,
      levels = c("SEC","PH","HUM", "ECO","LEG","SOL","EFE","EU")
    ),
    Group = if_else(
      Justification == "EU",
      "European Union Reference",
      if_else(
        Justification %in% c("SEC","PH","HUM","ECO"),
        "Justification Aim",
        "Process Justification"
      )
    )
  )

# One data frame per legend group; each one gets its own fill scale below.
aim_df <- justification_sums_df[
  which(justification_sums_df$Group == "Justification Aim"), ]
proc_df <- justification_sums_df[
  which(justification_sums_df$Group == "Process Justification"), ]
eu_df <- justification_sums_df[
  which(justification_sums_df$Group == "European Union Reference"), ]

# Three bar layers, separated by new_scale_fill() so that every group has
# its own colour palette and legend title.
p_fig1 <- ggplot() +
  geom_col(
    data = aim_df,
    mapping = aes(x = Justification, y = Percent, fill = Justification)
  ) +
  scale_fill_manual(
    name = "Justification Aim",
    values = c("SEC"="#003366","PH"="#336699","HUM"="#6699CC", "ECO"="#99CCFF"),
    labels = c("SEC"="Security (SEC)","PH"="Public Health (PH)","HUM"="Humanitarian (HUM)","ECO"="Economy (ECO)")
  ) +
  ggnewscale::new_scale_fill() +
  geom_col(
    data = proc_df,
    mapping = aes(x = Justification, y = Percent, fill = Justification)
  ) +
  scale_fill_manual(
    name = "Process Justification",
    values = c("LEG"="#8b814c","SOL"="#CDBE70","EFE"="#EEDC82"),
    labels = c("LEG"="Legal (LEG)","SOL"="Solidarity (SOL)","EFE"="Effectiveness (EFE)")
  ) +
  ggnewscale::new_scale_fill() +
  geom_col(
    data = eu_df,
    mapping = aes(x = Justification, y = Percent, fill = Justification)
  ) +
  scale_fill_manual(
    name = "Overall References",
    values = c("EU"="#90EE90"),
    labels = c("EU"="EU Reference (EU)")
  ) +
  labs(x = "Justification", y = "Percentage (%)") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

save_jpg(p_fig1, "paper_figure_1_distribution.jpg")


# ------------------------------------------------------------------------------
# Figure 2: Policy Type × Justification
# ------------------------------------------------------------------------------

# ------------------------------------------------------------------------------
# Shared prep for crosstabs
# ------------------------------------------------------------------------------
# Drop every column whose name ends in "_OTH" before reshaping.
dataset_filtered <- dataset %>% select(-ends_with("_OTH"))

# Long table with one row per coded justification type:
# DOC_ID, DATE, Policy ("P1".."Pn"), JustificationType ("SEC", "SOL", ...).
justifications_long <- dataset_filtered %>%
  pivot_longer(
    # Indicator columns named P<policy>J<slot>_SOL or P<policy>J<slot>_J_<type>
    # (the S_O_B columns are deliberately not included).
    cols = matches("P\\d+J\\d+_(?:SOL|J_\\w+)$"),
    names_to = c("PolicyJustification","JustificationType"),
    names_pattern = "(P\\d+J\\d+)_(SOL|J_\\w+)$",
    values_to = "Value"
  ) %>%
  # Keep only indicators that are switched on.
  filter(Value == 1) %>%
  mutate(
    # Only the J_* names carry a prefix; "SOL" is left untouched.
    JustificationType = sub("^J_", "", JustificationType),
    # Strip the justification slot to get the policy id ("P1J2" -> "P1").
    # Uses the same J\d+ form as the column selection above, so a slot
    # beyond J3 cannot leave its suffix behind and break the join on Policy.
    Policy = sub("J\\d+$", "", PolicyJustification)
  ) %>%
  select(DOC_ID, DATE, Policy, JustificationType)


# ------------------------------------------------------------------------------
# Policy Type × Justification (row proportions)
# ------------------------------------------------------------------------------
# One row per (DOC_ID, Policy) describing the policy type(s) switched on:
#   PolicyType_list     - all types, sorted and joined with "; "
#   PolicyType          - the first type encountered in the group
#   PolicyType_is_multi - TRUE when more than one type is set
policy_types_resolved <- dataset_filtered %>%
  pivot_longer(
    cols = all_of(
      grep("P\\d+_POL_(CON|VISA|SEC|QUO|ASY)$", names(dataset_filtered), value = TRUE)
    ),
    names_to = c("Policy","PolicyType"),
    names_sep = "_POL_",
    values_to = "PolicyTypeValue"
  ) %>%
  filter(PolicyTypeValue == 1) %>%
  distinct(DOC_ID, Policy, PolicyType) %>%
  group_by(DOC_ID, Policy) %>%
  summarise(
    # The list must be built before PolicyType is collapsed to one value.
    PolicyType_list = paste(sort(unique(PolicyType)), collapse = "; "),
    PolicyType = first(PolicyType),
    PolicyType_is_multi = n() >= 2,
    .groups = "drop"
  )

# Attach the policy type information to every justification row. The
# relationship check guards against accidental row duplication.
merged_data <- justifications_long %>%
  left_join(
    policy_types_resolved,
    by = c("DOC_ID","Policy"),
    relationship = "many-to-one"
  )

# One row per distinct (document, policy, policy type, justification type).
# PolicyType_list already equals the single type when only one is set, so it
# is expanded directly; policies with several types contribute one row each.
base_for_tab <- merged_data %>%
  transmute(
    DOC_ID,
    Policy,
    PolicyType = PolicyType_list,
    JustificationType
  ) %>%
  separate_rows(PolicyType, sep = "\\s*;\\s*") %>%
  filter(!is.na(PolicyType), !is.na(JustificationType)) %>%
  distinct()

# Counts (rows = policy type, columns = justification type) and the
# corresponding within-row proportions.
justification_policy_tab <- table(
  base_for_tab[["PolicyType"]],
  base_for_tab[["JustificationType"]]
)
justification_policy_prop <- prop.table(justification_policy_tab, margin = 1)

# Save both tables in long format.
save_tbl(as.data.frame(justification_policy_tab),  "02_policyType_x_justification_counts.csv")
save_tbl(as.data.frame(justification_policy_prop), "02_policyType_x_justification_row_props.csv")

# Long-format proportions with fixed factor levels for both dimensions.
df_justification_policy_prop_long <- justification_policy_prop %>%
  as.table() %>%
  as.data.frame() %>%
  setNames(c("PolicyType","JustificationType","Proportion")) %>%
  mutate(
    JustificationType = factor(
      JustificationType,
      levels = c("SEC","PH","HUM", "ECO","LEG","SOL","EFE","EU")
    ),
    PolicyType = factor(
      PolicyType,
      levels = c("SEC","VISA","CON","QUO","ASY")
    )
  )

# Heatmap of row proportions. The order of the y axis is set by the `limits`
# of scale_y_discrete(), not by the factor levels assigned above.
p_fig2 <- ggplot(
  data = df_justification_policy_prop_long,
  mapping = aes(x = JustificationType, y = PolicyType, fill = Proportion)
) +
  geom_tile() +
  labs(x = "Justification", y = "Policy Type", fill = "Proportion") +
  scale_fill_gradient(low = "#EEDC82", high = "#6699CC") +
  scale_y_discrete(
    limits = rev(c("VISA","SEC","QUO","CON","ASY")),
    labels = c("VISA"="Visa","SEC"="Securitization","QUO"="Quota","CON"="Controls","ASY"="Asylum")
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

save_jpg(p_fig2, "paper_figure_2_policyType_heatmap.jpg")

# ------------------------------------------------------------------------------
# Counts by month, year, half-year (documents with coding)
# ------------------------------------------------------------------------------

# 1. Parse DATE (tries "mm/dd/YYYY" first, then "YYYY-mm-dd") and derive the
#    time units used for the counts below.
dataset <- dataset %>%
  mutate(
    # Parser warnings are suppressed because each value is expected to fail
    # one of the two formats; values failing both are reported afterwards.
    DATE_clean = coalesce(
      suppressWarnings(lubridate::mdy(DATE)),
      suppressWarnings(lubridate::ymd(DATE))
    ),
    Year  = year(DATE_clean),
    Month = month(DATE_clean),
    Month_Year = format(DATE_clean, "%Y-%m"),
    Half  = if_else(Month <= 6, "H1", "H2"),
    # paste() would turn missing parts into the literal string "NA NA";
    # keep a real NA so unparsed dates look the same in every time unit.
    Half_Year = if_else(is.na(DATE_clean), NA_character_, paste(Year, Half))
  )

# Report dates that matched neither format instead of dropping them silently.
n_unparsed_dates <- sum(is.na(dataset$DATE_clean) & !is.na(dataset$DATE))
if (n_unparsed_dates > 0) {
  warning(
    n_unparsed_dates,
    " DATE value(s) could not be parsed as mm/dd/YYYY or YYYY-mm-dd; ",
    "they are counted under NA.",
    call. = FALSE
  )
}

# 2. Number of rows of `dataset` per month, year and half-year
#    (one output row per time unit, count stored in column N).
monthly_counts  <- count(dataset, Month_Year, name = "N")
yearly_counts   <- count(dataset, Year, name = "N")
halfyear_counts <- count(dataset, Half_Year, name = "N")

# 3. Save the tables. The file names mention "justification", but the values
#    are row counts of `dataset`.
save_tbl(monthly_counts,  "monthly_justification_counts.csv")
save_tbl(yearly_counts,   "yearly_justification_counts.csv")
save_tbl(halfyear_counts, "halfyear_justification_counts.csv")

# End
